1 Setup

setwd("/mnt/picea/projects/algae/cfunk/algal-acclimatization/Salmon")
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(hyperSpec))
suppressPackageStartupMessages(library(LSD))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(pander))
suppressPackageStartupMessages(library(plotly))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(scatterplot3d))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(tximport))
suppressPackageStartupMessages(library(vsn))
source("~/Git/UPSCb/src/R/featureSelection.R")
pal <- brewer.pal(8,"Dark2")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
samples <- read_delim(file = "~/Git/UPSCb/projects/algal-acclimatization/doc/Samples.tsv", delim="\t") %>% 
  mutate(Time=relevel(factor(Time),"std")) %>% 
  mutate(Replicate=factor(Replicate))
## Parsed with column specification:
## cols(
##   SampleID = col_character(),
##   Time = col_character(),
##   Replicate = col_character()
## )

2 Raw data

filelist <- list.files(".", 
                    recursive = TRUE, 
                    pattern = "quant.sf",
                    full.names = TRUE)

Select the samples containing fungi

names(filelist) <- basename(dirname(filelist))

stopifnot(all(names(filelist) %in% samples$SampleID))

filelist <- filelist[match(samples$SampleID,names(filelist))]

Read the transcript expression

tx <- suppressMessages(tximport(files = filelist, 
                                type = "salmon",
                                txOut=TRUE))

counts <- round(tx$counts)

2.1 Raw Data QC analysis

Check how many genes are never expressed

sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s transcripts are probable chimeraes",
        round(sum(sel) * 100/ nrow(counts),digits=1),
        sum(sel),
        nrow(counts))
## [1] "5.4% percent (15353) of 285908 transcripts are probable chimeraes"
  • Sequencing depth The sequencing depth is very similar between samples, around 15M reads
ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>% 
         bind_cols(samples),
       aes(x,y,fill=Time)) + geom_col() + 
  scale_y_continuous(name="reads") +
  theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())

Display the per-gene mean expression

i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.

The cumulative gene coverage is shifted towards low expression, which may be due to both technical (chimearaes) and biological (e.g. bacteria) aspects

ggplot(melt(log10(rowMeans(counts))),aes(x=value)) + 
  geom_density() + ggtitle("gene mean raw counts distribution") +
  scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 15353 rows containing non-finite values (stat_density).

The same is done for the individual samples colored by condition. The gene coverage across samples is extremely similar

dat <- as.data.frame(log10(counts)) %>% utils::stack() %>% 
  mutate(Time=samples$Time[match(ind,samples$SampleID)])
  • Color by Time We observe no bias for any given time point and all samples behave similarly
ggplot(dat,aes(x=values,group=ind,col=Time)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 4669330 rows containing non-finite values (stat_density).

2.2 Export

dir.create(file.path("..","analysis","salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file="../analysis/salmon/raw-unormalised-gene-expression_data.csv")

2.3 Data normalisation

2.3.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate

dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = samples,
  design = ~ Time)
## converting counts to integer mode
save(dds,file="../analysis/salmon/dds.rda")

Check the size factors (i.e. the sequencing library size effect)

The sequencing depth is not variable -/+ 25%

dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
Table continues below
B2_2_120hrs_C B2_2_12hrs_B B2_2_120hrs_A B2_2_120hrs_B B2_2_std_B
1.179 0.9015 1.1 1.078 1.103
Table continues below
B2_2_120hrs_D B2_2_12hrs_D B2_2_24hrs_B B2_2_4hrs_A B2_2_24hrs_A
1.017 1.286 1.245 0.8279 1.023
Table continues below
B2_2_24hrs_C B2_2_12hrs_C B2_2_4hrs_D B2_2_24hrs_D B2_2_60minD
1.13 0.8508 0.7608 0.9764 1.254
Table continues below
B2_2_60minB B2_2_72hrs_B B2_2_4hrs_C B2_2_60minA B2_2_72hrs_A
1.353 1.231 0.8076 1.039 1.175
Table continues below
B2_2_std_D B2_2_4hrs_B B2_2_std_A B2_2_60minC B2_2_std_C
1.312 0.7357 1.237 1.148 1.103
B2_2_12hrs_A B2_2_72hrs_D B2_2_72hrs_C
0.9302 0.9419 0.8654
boxplot(sizes, main="Sequencing libraries size factor")

2.4 Variance Stabilising Transformation

The transformation is done blind (ignoring the experimental design), as we are performing the quality assessment.

vsd <- varianceStabilizingTransformation(dds, blind=TRUE)

The normalised values are on an approx. log2 scale

vst <- assay(vsd)
vst <- vst - min(vst)
  • Validation

The variance stabilisation worked but not optimally, as we have a lot of very lowly expressed (presumably artefactual) transcripts. On the other hand, transcripts that are more highly expressed show a relatively constant variance. There is also evidence that some transcripts display a large variance. If that variance is linked to the study design, we will have sufficient power to detect differential expression. To assess that we can perform a Principal Component Analysis (PCA).

meanSdPlot(vst[rowSums(counts)>0,])

2.5 QC on the normalised data

2.5.1 PCA

The PCA is conducted by sample, hence the matrix transposition below.

pc <- prcomp(t(vst))
  
percent <- round(summary(pc)$importance[2,]*100)

2.5.2 3 first dimensions

This looks interesting as the sample separate clearly Time There may be a sample swap between 12 and 24 hours

mar=c(5.1,4.1,4.1,2.1)
scatterplot3d(pc$x[,1],
              pc$x[,2],
              pc$x[,3],
              xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
              ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
              zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
              color=pal[as.integer(samples$Time)],pch=19)
legend("topleft",
       fill=pal[1:nlevels(samples$Time)],
       legend=levels(samples$Time))

par(mar=mar)

2.5.3 2D

In all likelihood, B2_2_12hrs_D and B2_2_24hrs_D have been sample-swapped

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    samples)

p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,text=SampleID)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")


ggplotly(p) %>% layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
                       yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))

2.5.4 Heatmap

Filter for noise A cutoff at a VST value of 1 leaves about 15000 genes, which is adequate for the QA

sels <- rangeFeatureSelect(counts=vst,
                   conditions=samples$Time,
                   nrep=3)

Let’s zoom in a bit

pander(sapply(sels,sum))

285908, 152412, 106165, 78841, 60702, 46744, 34705, 24644, 16434, 10092, 5562, 2699, 1182, 485, 174, 55, 14, 3 and 1

  • Heatmap of “all” genes We know that a lot of transcripts have low expression and may be artefactual.

As such, we select an higher cutoff (8)

ctf=8

Taking into account all the genes (above that noise threshold), the data clearly cluster according to the time, in two main clusters, an early (std until 12h) and a late one (24 to 120h). The late cluster is relatively homogeneous with almost no change between 72 and 120h. Here, the sample swap is very visible.

heatmap.2(t(scale(t(vst[sels[[ctf]],]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = samples$Time,
          col=hpal)

2.6 Conclusion

The quality of the data is good. The PCA shows that the samples cluster bytime, however, there has been in all likelihood a sample swap.

3 Session Info

## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.3 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] vsn_3.52.0                  tximport_1.12.3            
##  [3] forcats_0.4.0               stringr_1.4.0              
##  [5] dplyr_0.8.3                 purrr_0.3.2                
##  [7] readr_1.3.1                 tidyr_0.8.3                
##  [9] tibble_2.1.3                tidyverse_1.2.1            
## [11] scatterplot3d_0.3-41        RColorBrewer_1.1-2         
## [13] plotly_4.9.0                pander_0.6.3               
## [15] LSD_4.0-0                   hyperSpec_0.99-20180627    
## [17] ggplot2_3.2.1               lattice_0.20-38            
## [19] gplots_3.0.1.1              DESeq2_1.24.0              
## [21] SummarizedExperiment_1.14.1 DelayedArray_0.10.0        
## [23] BiocParallel_1.18.1         matrixStats_0.54.0         
## [25] Biobase_2.44.0              GenomicRanges_1.36.0       
## [27] GenomeInfoDb_1.20.0         IRanges_2.18.2             
## [29] S4Vectors_0.22.0            BiocGenerics_0.30.0        
## [31] data.table_1.12.2          
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_1.4-1       htmlTable_1.13.1       XVector_0.24.0        
##   [4] base64enc_0.1-3        rstudioapi_0.10        hexbin_1.27.3         
##   [7] affyio_1.54.0          bit64_0.9-7            AnnotationDbi_1.46.1  
##  [10] lubridate_1.7.4        xml2_1.2.2             splines_3.6.1         
##  [13] geneplotter_1.62.0     knitr_1.24             zeallot_0.1.0         
##  [16] Formula_1.2-3          jsonlite_1.6           broom_0.5.2           
##  [19] annotate_1.62.0        cluster_2.1.0          shiny_1.3.2           
##  [22] BiocManager_1.30.4     compiler_3.6.1         httr_1.4.1            
##  [25] backports_1.1.4        assertthat_0.2.1       Matrix_1.2-17         
##  [28] lazyeval_0.2.2         limma_3.40.6           cli_1.1.0             
##  [31] later_0.8.0            acepack_1.4.1          htmltools_0.3.6       
##  [34] tools_3.6.1            affy_1.62.0            gtable_0.3.0          
##  [37] glue_1.3.1             GenomeInfoDbData_1.2.1 reshape2_1.4.3        
##  [40] Rcpp_1.0.2             cellranger_1.1.0       vctrs_0.2.0           
##  [43] preprocessCore_1.46.0  gdata_2.18.0           nlme_3.1-141          
##  [46] crosstalk_1.0.0        xfun_0.9               testthat_2.2.1        
##  [49] rvest_0.3.4            mime_0.7               gtools_3.8.1          
##  [52] XML_3.98-1.20          zlibbioc_1.30.0        scales_1.0.0          
##  [55] promises_1.0.1         hms_0.5.1              yaml_2.2.0            
##  [58] memoise_1.1.0          gridExtra_2.3          rpart_4.1-15          
##  [61] latticeExtra_0.6-28    stringi_1.4.3          RSQLite_2.1.2         
##  [64] highr_0.8              genefilter_1.66.0      checkmate_1.9.4       
##  [67] caTools_1.17.1.2       rlang_0.4.0            pkgconfig_2.0.2       
##  [70] bitops_1.0-6           evaluate_0.14          labeling_0.3          
##  [73] htmlwidgets_1.3        bit_1.1-14             tidyselect_0.2.5      
##  [76] plyr_1.8.4             magrittr_1.5           R6_2.4.0              
##  [79] generics_0.0.2         Hmisc_4.2-0            DBI_1.0.0             
##  [82] pillar_1.4.2           haven_2.1.1            foreign_0.8-72        
##  [85] withr_2.1.2            survival_2.44-1.1      RCurl_1.95-4.12       
##  [88] nnet_7.3-12            modelr_0.1.5           crayon_1.3.4          
##  [91] KernSmooth_2.23-15     rmarkdown_1.15         locfit_1.5-9.1        
##  [94] readxl_1.3.1           blob_1.2.0             digest_0.6.20         
##  [97] xtable_1.8-4           httpuv_1.5.1           munsell_0.5.0         
## [100] viridisLite_0.3.0